Transcriptional Factor analysis in neighborhood Zone (TFinZONE)

General Information of this Script

                                       ## **Aim of the Script** 

######### Transcriptional Factor analysis in neighborhood Zone (TFinZONE) ###############################################
##### This Script aims to analyze the DNA-Motifs associated with the chromatin States areas as scRNA-seq data. Instead of an expression of genes, we propose an enrichment score. The multiplication of the motif score on the Peak and the log10-Pvalue of the motif in the chromatin state of interest.
## Input files
## 

## **Input files**
   ##### 08MOTIFS_on_PEAKS
      - Folder containing all the output results from homer motifs of all Peaks files used in the program.

   ##### 08TSS_Enh_w_04dpci_fHeatmap_ALL.xlsx
      - Matrix file, Containing the log10Pvalue from each motifs.
    
  ## **Output files**
    #### 03OUTPUT
        - Folder containing all the plots used in figure 4. As well as additional output from the Script

Set working directories and Project name

 workdir = "./"
setwd(workdir)

PTHA="../03OUTPUT/"
dir.create(PTHA)
PROJECT="03a_TFHOOD_at_04dpci"

PORT=paste(PTHA,PROJECT,"/",sep="")
PTHA1=paste(PTHA,PROJECT,"/",sep="")

dir.create(PTHA)
dir.create(PORT)
dir.create(PTHA1)
TRY="_"
TRY1="02Ia_"
PVALUE=0.05
#### FOR VOLCANO ###
upcol<- "#B2182B" # magenta from PiyG
nc<- "#000000" # black
downcol<- "grey" # green from PiyG
CC= c(downcol, "#F7F7F7",upcol)
CO3<- c("lightgrey","#B2182B")
CO_ALU3=c("#d7191c","#d8b365","#542788","grey", "#91bfdb", "grey","grey")

NAME33<- c("Mo_Name","Consensus","P-value","Log_P-value","q-value_Benja","Nor_of_Tar","Per_of_Tar","No_of_Tar_Backg","Per_of_Tar_Backg")
NAME34<- c("HOmer_NAME",NAME33)
mat_m<- data.frame("mask_500_FIND")
## Here you write the Path to the 
WORK1="../01DATA_ORI/08MOTIFS_on_PEAKS/"

#WORK1="Y:/002ZF/000FIGURES_ANALY/00GITHUB_01HIS_Chroma_Factors/01HIS_Chrom_Factors_TEST/04MOtifs_Analy_as_SC_MOAC_Fig3_5/02Fig4_MO_MOAC_04dpci/01DATA_ORI/08MOTIFS_on_PEAKS/"
NAME_G2 <-data.frame(list.files(path=WORK1,pattern="*_FIND"))

colnames(NAME_G2)<- "ENH4"
#NAME_G2$GROUP <- gsub("CORR1_01d_", "", NAME_G2$ENH4)
NAME_G2$GROUP <- gsub("*_mask_500_FIND", "", NAME_G2$ENH4)
### this is important to check the SYMBOL
NAME_G22<- data.frame(c("00A_Ia","01Ia_A","02U_Ea","03U_Ia","04U_Pa"))


NAME_G2c<- as.character(t(NAME_G2[,2]))

#WORK11A11="../01DATA_ORI/07aMO_forMOAC/08TSS_Enh_w_04dpci_fHeatmap_ALL.xlsx"
WORK11A11="../01DATA_ORI/07aMO_forSeurat_ANALY/08TSS_Enh_w_04dpci_fHeatmap_ALL.xlsx"

MAT_1A= read_excel(WORK11A11)
colnames(MAT_1A)[1]<- "SYMBOL"

dim(MAT_1A)
## [1] 338  11
##### Set resolution for CLustering
RES=0.5

Set processors

## sequential:
## - args: function (..., envir = parent.frame())
## - tweaked: FALSE
## - call: NULL
## sequential:
## - args: function (..., workers = 16, envir = parent.frame())
## - tweaked: TRUE
## - call: plan("sequential", workers = 16)

Load motifs to make the name in the matrix

gather the matrix from all the TFs found in this stage

MERGE MOTIFS vs SIG_MO from ALL MOTIFS

## 
## 00A_Ia_04dcpi 01Ia_A_04dcpi 02U_Ea_04dcpi 03U_Ia_04dcpi 04U_Pa_04dcpi 
##         39886        291670         14945         35034         37650
## [1] 3

AS SINGLE CELL PREPARE MATRIX

Create a Seurat object

b7<- AC_RE_VOL_SC_META
rownames(b7)<- b7$PositionID2

mat_c1<- mat_cm_TMM_BOTH_SC[,rownames(b7)]
rownames(mat_c1)<- mat_cm_TMM_BOTH_SC$SYMBOL
dim(mat_c1)
## [1]  240 4647
dim(b7)
## [1] 4647    5
smartseq2 <- CreateSeuratObject(mat_c1,project = "04d", meta.data = b7)

Identification of highly variable features

## png 
##   2

PLot from TOP TFs bound in the data

arrangeQC_TF <- ggarrange(EC00,EC001, ncol = 2,nrow =1,common.legend = T, align = c("hv"),legend="none")

print(arrangeQC_TF)

grid.arrange(  plot1,plot2, ncol = 1,nrow = 2)

Scaling the data

all.genes <- rownames(smartseq2)
smartseq2 <- ScaleData(smartseq2, features = all.genes,model.use    
="linear")

Perform linear dimensional reduction

smartseq2 <- RunPCA(smartseq2, features = VariableFeatures(object = smartseq2))
VizDimLoadings(smartseq2, dims = 1:4, reduction = "pca")

Plot PCA

PLot Pincipal component analysis (PCA)

grid.arrange(  P1, ncol = 1,nrow = 1)

Plot heatmap from 200 Peaks

PLot Pincipal component analysis CA from TOP TFs bound in the data

print(P2)

PCA and checking of the PC which is similar to say how many cluster of cells are in the data

## png 
##   2

PLot to identify how many cluster in the Data

#grid.arrange(  P3, ncol = 1,nrow = 1)

Perform Unsupervised Clusters

set.seed(1)
smartseq2 <- FindNeighbors(smartseq2, dims = 1:10)
smartseq2 <- FindClusters(smartseq2, resolution = RES, random.seed= 1, algorithm=1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4647
## Number of edges: 146524
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8811
## Number of communities: 12
## Elapsed time: 0 seconds

#########here

Plot UMAP to find Neighborg to use for the DATA

## png 
##   2
## png 
##   2
Plot UMAP and TSNE PLots
print(arrange1t)

print(arrange1)

print(arrange1bt)

print(arrange1c)

print(arrange1cat)

## png 
##   2

Plot UMAP and TSNE PLots by GROUP (Divergent areas)

print(arrange3)

print(arrange3t)

print(arrange2)

Find markers for every cluster 0% in clusters and 0.20 fc

Plot Barplot, number of peaks per cluster and Proteins per Divergent areas

grid.arrange(  P20,P201, ncol = 1,nrow = 2)

Heatmap of the top 10 markers after the Binomil TEST

Plot Barplot, number of peaks per cluster and Proteins per Divergent areas

grid.arrange(  P05ah_bino1, ncol = 1,nrow = 1)

grid.arrange(  P05ah_bino, ncol = 1,nrow = 1)

save output after UMAP

Extract analized matrix from Seurat objects

DATA_SC<- data.frame(smartseq2@assays[["RNA"]]@counts@i)
DATA_SC<- data.frame(smartseq2@assays[["RNA"]]@counts@i)
DATA_SC<- GetAssay(smartseq2,assay = "RNA")
DATA_SC1<- data.frame(DATA_SC@scale.data)
DATA_SC2<- data.frame(DATA_SC@scale.data)
#dim(DATA_SC1)
META_SEU<- data.frame(smartseq2@meta.data)
colnames(DATA_SC1)<- META_SEU$PositionID
colnames(META_SEU)[4]<- "SYMBOL_TF"

Selection of candidate genes from 04dpci marker list. This is performed after checking the output excel file

GENES001<- c("CEBP","Cdx2", "Nanog","Smad2","Smad4","Sox9","Sox10","Mef2c","Gata4")
GENES002 <- c("Eomes","FOXA1",  "FOXM1","CLOCK","MITF", "MafA", "Mef2b")
GENES003 <- c("Eomes","Oct4","Foxh1","CUX1","Bapx1", "Elk4","Foxo1","Fli1")
GENES004<- c("Rbp7","Rgcc", "Fabp4","Egfl7","Flt1", "Cd36","Nrp1","Tpm1", "Rbp1")
GENES005<- c("Mfap5","Dcn","Mgp","Cfh","Serpine2","Mt1","Gpx3","Rbp1","Rcn3")

VIolin plot by Regulatory type (Ea, Ia, Pa)

VIolin plot splited by seurat cluster and grouped by GROUP (00A_Ia, 01Ia_A, 02U_Ea, 03U_Ia, 04U_Pa)

## [[1]]
## 
## [[2]]
## 
## [[3]]
## 
## [[4]]
## 
## [[5]]
## 
## [[6]]
## 
## [[7]]
## 
## [[8]]
## 
## [[9]]
## 
## [[1]]
## 
## [[2]]
## 
## [[3]]
## 
## [[4]]
## 
## [[5]]
## 
## [[6]]
## 
## [[7]]
## 
## [[1]]
## 
## [[2]]
## 
## [[3]]
## 
## [[4]]
## 
## [[5]]
## 
## [[6]]
## 
## [[7]]
## png 
##   2
## png 
##   2

VIolin plot splited by seurat cluster and grouped by GROUP (00A_Ia, 01Ia_A, 02U_Ea, 03U_Ia, 04U_Pa)

print(arrange12)

Plot several plot from genes in each cluster

## png 
##   2
## png 
##   2

UMAP from the selected genes

print(P05a)

print(P05b)

print(P05c)

TSNE from the selected genes

print(P05a1)

print(P05b1)

print(P05c1)

VIolin plot by Clusters found in the script

VIolin plot splited by seurat cluster and grouped by GROUP (00A_Ia, 01Ia_A, 02U_Ea, 03U_Ia, 04U_Pa)

Violin PLot of selected Markers

## png 
##   2
## png 
##   2
print(arrange1CM41)

print(arrange1CM42)

print(arrange1CM43)